MoneyBallR_Modeling

## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid

#Load in the Data

## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1       v purrr   0.3.2  
## v tidyr   0.8.3       v dplyr   0.8.0.1
## v readr   1.3.1       v stringr 1.4.0  
## v tibble  2.1.1       v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::arrange()    masks plyr::arrange()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact()    masks plyr::compact()
## x dplyr::count()      masks plyr::count()
## x purrr::discard()    masks scales::discard()
## x dplyr::failwith()   masks plyr::failwith()
## x dplyr::filter()     masks stats::filter()
## x dplyr::id()         masks plyr::id()
## x dplyr::lag()        masks stats::lag()
## x dplyr::mutate()     masks plyr::mutate()
## x dplyr::rename()     masks plyr::rename()
## x dplyr::summarise()  masks plyr::summarise()
## x dplyr::summarize()  masks plyr::summarize()
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## corrplot 0.84 loaded
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
## 
##     join
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
## 
##     empty
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## Loading required package: libcoin
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal, varimp
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'memisc'
## The following objects are masked from 'package:modeltools':
## 
##     Lapply, relabel
## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename, syms
## The following object is masked from 'package:purrr':
## 
##     %@%
## The following object is masked from 'package:scales':
## 
##     percent
## The following object is masked from 'package:plyr':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     syms
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:memisc':
## 
##     rename, style
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

#Missing Variables

Variable Missing Data Number of Records Share of Total
BATTING_HBP 2085 92%
BASERUN_CS 772 34%
FIELDING_DP 286 13%
BASERUN_SB 131 5.8%
BATTING_SO 102 4.5%
PITCHING_SO 102 4.5%

See if we can impute some misssing values Using a Random Forest

df_train[which(df_train$TARGET_WINS == 0), "TARGET_WINS"] <- round(mean(df_train$TARGET_WINS), 0)

# Change 0's to NA so they too can be imputed
df_train <- df_train %>%
  mutate(BATTING_SO = ifelse(BATTING_SO == 0, NA, BATTING_SO))
knn <- df_train %>% knnImputation()
impute_me <- is.na(df_train$BATTING_SO)
df_train[impute_me,"BATTING_SO"] <- knn[impute_me,"BATTING_SO"] 

impute_me <- is.na(df_train$BASERUN_SB)
df_train[impute_me,"BASERUN_SB"] <- knn[impute_me,"BASERUN_SB"] 

impute_me <- is.na(df_train$BASERUN_CS)
df_train[impute_me,"BASERUN_CS"] <- knn[impute_me,"BASERUN_CS"]

impute_me <- is.na(df_train$BATTING_HBP)
df_train[impute_me,"BATTING_HBP"] <- knn[impute_me,"BATTING_HBP"] 

df_train[which(df_train$PITCHING_SO > 5346),"PITCHING_SO"] <- NA
impute_me <- is.na(df_train$PITCHING_SO)
df_train[impute_me,"PITCHING_SO"] <- knn[impute_me,"PITCHING_SO"]

impute_me <- is.na(df_train$FIELDING_DP)
df_train[impute_me,"FIELDING_DP"] <- knn[impute_me,"FIELDING_DP"] 

# Change 0's to NA so they too can be imputed
df_test <- df_test %>%
  mutate(BATTING_SO = ifelse(BATTING_SO == 0, NA, BATTING_SO))
knn <- df_test %>% knnImputation()
impute_me <- is.na(df_test$BATTING_SO)
df_test[impute_me,"BATTING_SO"] <- knn[impute_me,"BATTING_SO"] 

impute_me <- is.na(df_test$BASERUN_SB)
df_test[impute_me,"BASERUN_SB"] <- knn[impute_me,"BASERUN_SB"] 

impute_me <- is.na(df_test$BASERUN_CS)
df_test[impute_me,"BASERUN_CS"] <- knn[impute_me,"BASERUN_CS"]

impute_me <- is.na(df_test$BATTING_HBP)
df_test[impute_me,"BATTING_HBP"] <- knn[impute_me,"BATTING_HBP"] 

df_test[which(df_test$PITCHING_SO > 5346),"PITCHING_SO"] <- NA
impute_me <- is.na(df_test$PITCHING_SO)
df_test[impute_me,"PITCHING_SO"] <- knn[impute_me,"PITCHING_SO"]

impute_me <- is.na(df_test$FIELDING_DP)
df_test[impute_me,"FIELDING_DP"] <- knn[impute_me,"FIELDING_DP"] 

#Correlation Matrix

## Loading required package: xts
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend

#Feature Creation and Data Cleaning

#Look at Linear Model with all First Order Predictors

## Warning in predict.lm(model, hold_out_data): prediction from a rank-
## deficient fit may be misleading
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.832  -8.562  -0.082   8.595  68.412 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.4283905  7.5603316  -0.189 0.850170    
## BATTING_H    0.0498665  0.0043376  11.496  < 2e-16 ***
## BATTING_2B  -0.0203716  0.0111070  -1.834 0.066823 .  
## BATTING_3B   0.0734573  0.0209907   3.500 0.000479 ***
## BATTING_HR   0.0432008  0.0337196   1.281 0.200320    
## BATTING_BB   0.0181073  0.0071850   2.520 0.011828 *  
## BATTING_SO   0.0004028  0.0026430   0.152 0.878876    
## BASERUN_SB   0.0248181  0.0052309   4.744 2.28e-06 ***
## BASERUN_CS  -0.0125798  0.0183880  -0.684 0.493991    
## BATTING_HBP  0.2719421  0.0734697   3.701 0.000222 ***
## PITCHING_H   0.0003889  0.0004152   0.937 0.349036    
## PITCHING_HR -0.0084095  0.0294120  -0.286 0.774977    
## PITCHING_BB -0.0015055  0.0051259  -0.294 0.769016    
## PITCHING_SO  0.0021441  0.0014467   1.482 0.138504    
## FIELDING_E  -0.0179990  0.0029202  -6.164 9.00e-10 ***
## FIELDING_DP -0.1206721  0.0153576  -7.857 7.19e-15 ***
## BATTING_1B          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.37 on 1579 degrees of freedom
## Multiple R-squared:  0.2956, Adjusted R-squared:  0.2889 
## F-statistic: 44.17 on 15 and 1579 DF,  p-value: < 2.2e-16

#Look at some Decision Trees for Variable Interactions

##Look at Variance Inflation Factors

## [1] "BATTING_2B"
## [1] Inf
## [1] "BATTING_3B"
## [1] Inf
## [1] "BATTING_HR"
## [1] Inf
## [1] "BATTING_BB"
## [1] 6.774763
## [1] "BATTING_SO"
## [1] 3.228496
## [1] "BASERUN_SB"
## [1] 1.87324
## [1] "BASERUN_CS"
## [1] 1.171899
## [1] "BATTING_HBP"
## [1] 1.237874
## [1] "PITCHING_H"
## [1] 3.532532
## [1] "PITCHING_HR"
## [1] 29.0385
## [1] "PITCHING_BB"
## [1] 6.31173
## [1] "PITCHING_SO"
## [1] 2.93079
## [1] "FIELDING_E"
## [1] 4.304486
## [1] "FIELDING_DP"
## [1] 1.374294
## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable
## [1] "BATTING_1B"
## [1] Inf

#Stepwise Regression

## 
## Call:
## lm(formula = poly.call[2], data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.830  -7.556   0.164   7.283  57.160 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5.415e+01  1.868e+01  -2.900  0.00379 ** 
## BATTING_2B        8.417e-01  1.423e-01   5.915 4.06e-09 ***
## BATTING_3B       -3.134e-01  2.191e-01  -1.430  0.15278    
## PITCHING_H        2.497e-02  4.318e-03   5.784 8.80e-09 ***
## PITCHING_HR      -1.370e-01  8.782e-02  -1.560  0.11893    
## PITCHING_BB       3.619e-01  5.721e-02   6.326 3.27e-10 ***
## PITCHING_SO      -1.676e-02  3.168e-03  -5.289 1.40e-07 ***
## FIELDING_E       -2.072e-01  2.623e-02  -7.900 5.23e-15 ***
## I(BATTING_2B^2)  -2.457e-03  4.135e-04  -5.942 3.46e-09 ***
## I(BATTING_3B^2)   1.146e-02  4.281e-03   2.678  0.00748 ** 
## I(BATTING_BB^2)  -7.817e-04  1.353e-04  -5.778 9.14e-09 ***
## I(BATTING_SO^2)   6.141e-05  7.866e-06   7.807 1.07e-14 ***
## I(BASERUN_SB^2)   4.044e-04  8.483e-05   4.767 2.04e-06 ***
## I(BASERUN_CS^2)  -1.102e-03  3.738e-04  -2.948  0.00325 ** 
## I(PITCHING_H^2)  -5.094e-06  6.362e-07  -8.007 2.28e-15 ***
## I(PITCHING_HR^2)  2.366e-03  1.142e-03   2.072  0.03842 *  
## I(PITCHING_BB^2) -5.812e-04  9.382e-05  -6.195 7.46e-10 ***
## I(FIELDING_E^2)   3.527e-04  6.322e-05   5.579 2.85e-08 ***
## I(BATTING_1B^2)   1.726e-05  2.087e-06   8.270 2.84e-16 ***
## I(BATTING_3B^3)  -9.213e-05  3.217e-05  -2.864  0.00424 ** 
## I(BATTING_BB^3)   1.749e-06  2.994e-07   5.842 6.28e-09 ***
## I(BATTING_SO^3)  -4.232e-08  5.704e-09  -7.420 1.92e-13 ***
## I(BASERUN_SB^3)  -1.089e-06  3.483e-07  -3.128  0.00180 ** 
## I(BASERUN_CS^3)   5.896e-06  2.157e-06   2.733  0.00634 ** 
## I(PITCHING_H^3)   3.186e-10  3.902e-11   8.164 6.65e-16 ***
## I(PITCHING_HR^3) -1.006e-05  5.630e-06  -1.787  0.07410 .  
## I(PITCHING_BB^3)  3.517e-07  6.088e-08   5.776 9.23e-09 ***
## I(PITCHING_SO^3)  3.523e-10  2.124e-10   1.658  0.09748 .  
## I(FIELDING_E^3)  -2.685e-07  5.688e-08  -4.721 2.56e-06 ***
## I(FIELDING_DP^3) -8.793e-06  1.851e-06  -4.751 2.21e-06 ***
## I(BATTING_2B^4)   5.703e-09  9.815e-10   5.811 7.53e-09 ***
## I(BATTING_3B^4)   2.232e-07  7.876e-08   2.833  0.00467 ** 
## I(BATTING_BB^4)  -1.014e-09  1.899e-10  -5.341 1.06e-07 ***
## I(BASERUN_SB^4)   8.927e-10  3.627e-10   2.461  0.01395 *  
## I(PITCHING_H^4)  -5.817e-15  7.401e-16  -7.861 7.09e-15 ***
## I(PITCHING_HR^4)  1.373e-08  9.154e-09   1.500  0.13389    
## I(PITCHING_BB^4) -6.801e-11  1.307e-11  -5.205 2.20e-07 ***
## I(FIELDING_E^4)   6.626e-11  1.694e-11   3.911 9.58e-05 ***
## I(FIELDING_DP^4)  3.539e-08  8.693e-09   4.072 4.90e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.75 on 1556 degrees of freedom
## Multiple R-squared:  0.4613, Adjusted R-squared:  0.4482 
## F-statistic: 35.07 on 38 and 1556 DF,  p-value: < 2.2e-16

#Model Results on the Hold Out Dataset

## [1] 0.6128171

Critical Thinking Group 3

2019-09-24